x
# --- Notebook setup: imports, plot styling, data load, target encoding ---
import numpy as np                 # numerical library
import pandas as pd                # row/column data handling
import seaborn as sns              # statistical plots
import matplotlib.pyplot as plt    # base plotting library

# Figure styling.
plt.rc('font', size=14)
sns.set(style='white')
sns.set(style='whitegrid', color_codes=True)

import warnings
warnings.filterwarnings('ignore')  # NOTE(review): silences ALL warnings, including deprecations

# sklearn utilities used throughout the notebook (train/test split is
# random-seed based).
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from scipy.stats import zscore

# Load the vehicle silhouette data set and peek at both ends.
vehdf = pd.read_csv('vehicle.csv')
vehdf.head(10)
vehdf.tail(10)

# Encode the categorical target column 'class' (car/bus/van) as integers.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
columns = vehdf.columns
print(columns)
vehdf['class'] = le.fit_transform(vehdf['class'])
print(vehdf.shape)

# Per-column null counts, dtypes and total entries.
vehdf.info()

# Observations:
# 1. compactness, max.length_aspect_ratio, max.length_rectangularity,
#    hollows_ratio and class have no missing values; all other features
#    have some missing values.
# 2. All columns hold numeric values.
# --- Impute missing values with the per-column median ---
from sklearn.impute import SimpleImputer  # to impute missing values

newdf = vehdf.copy()
X = newdf.iloc[:, 0:19]  # all numeric columns (18 features + encoded class)

# Fill NaNs with each column's median. The 'verbose' argument was deprecated
# in scikit-learn 1.1 and removed in 1.3, so it is no longer passed here;
# the imputation behaviour is unchanged.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
transformed_values = imputer.fit_transform(X)

column = X.columns
print(column)

# Rebuild a DataFrame from the imputed array, keeping the original column names.
newdf = pd.DataFrame(transformed_values, columns=column)
newdf.describe()

print('Original null values count\n', vehdf.isnull().sum())
print("\n\nCount after we imputed the NaN value:\n", newdf.isnull().sum())

# Observation: the NaN values in the original vehdf columns have been
# replaced using the median strategy.
## Descriptive statistical summary

The describe() function gives the count, mean, std and IQR values. It excludes character columns and calculates summary statistics only for numeric columns.
# Transposed descriptive statistics: one row per feature for easier reading.
newdf.describe().transpose()

# Observation: compactness and circularity have nearly identical mean and
# median values, indicating a roughly normal distribution with no
# skewness/outliers.
x
# One histogram per numeric attribute (20 bins each) to eyeball all
# distributions in a single large figure.
newdf.hist(bins=20, figsize=(80, 60), color='lightblue',
           edgecolor='red', xlabelsize=18)
plt.show()

# Observation: most attributes look normally distributed; scaled_variance.1,
# skewness_about, skewness_about.1 and scatter_ratio appear right-skewed.
# Distribution plots for the skewed-looking attributes, side by side.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot with
# a KDE overlay is the supported equivalent.
fig, ax = plt.subplots(1, 6, figsize=(30, 5))
sns.histplot(newdf["scaled_variance.1"], bins=10, kde=True, ax=ax[0])
sns.histplot(newdf["scaled_variance"], bins=10, kde=True, ax=ax[1])
sns.histplot(newdf["skewness_about.1"], bins=10, kde=True, ax=ax[2])
sns.histplot(newdf["skewness_about"], bins=10, kde=True, ax=ax[3])
sns.histplot(newdf["scatter_ratio"], bins=10, kde=True, ax=ax[5])  # ax[4] was left unused in the original
fig.savefig('subplot.png')

# Skewness of every attribute: values far from 0 indicate asymmetry.
skewValue = newdf.skew()
print("skewValue of dataframe attributes:\n", skewValue)

# Summary boxplot of all attributes on one axis; individual boxplots follow
# to trace outliers per feature.
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax = sns.boxplot(data=newdf, orient="h")
# Individual boxplots for three attributes suspected to carry outliers.
plt.figure(figsize=(20, 15))
for pos, (col, shade) in enumerate(
        [('pr.axis_aspect_ratio', 'blue'),
         ('skewness_about', 'pink'),
         ('scaled_variance', 'red')], start=1):
    plt.subplot(3, 3, pos)
    sns.boxplot(x=newdf[col], color=shade)
plt.show()
# Boxplots for three more attributes with suspected outliers.
plt.figure(figsize=(20, 15))
for pos, (col, shade) in enumerate(
        [('radius_ratio', 'lightblue'),
         ('scaled_radius_of_gyration.1', 'orange'),
         ('scaled_variance.1', 'green')], start=1):
    plt.subplot(3, 3, pos)
    sns.boxplot(x=newdf[col], color=shade)
plt.show()
# Boxplots for the last two suspect attributes.
plt.figure(figsize=(20, 15))
for pos, (col, shade) in enumerate(
        [('max.length_aspect_ratio', 'purple'),
         ('skewness_about.1', 'grey')], start=1):
    plt.subplot(3, 3, pos)
    sns.boxplot(x=newdf[col], color=shade)
plt.show()

# Observation on boxplots: pr.axis_aspect_ratio, skewness_about,
# max.length_aspect_ratio, skewness_about.1, scaled_radius_of_gyration.1,
# scaled_variance.1 and radius_ratio are attributes with outliers.
x
newdf.shape

# IQR-based outlier filtering: drop every row that has a value outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in ANY column.
from scipy.stats import iqr  # NOTE(review): imported but unused; quantile() is used below instead

Q1 = newdf.quantile(0.25)
Q3 = newdf.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

# Keep only the rows where every column lies inside the whisker bounds.
cleandf = newdf[~((newdf < (Q1 - 1.5 * IQR)) | (newdf > (Q3 + 1.5 * IQR))).any(axis=1)]
cleandf.shape
# NOTE(review): cleandf is later reassigned from the *unfiltered* newdf
# (before the correlation heatmap), so this outlier removal does not feed
# the PCA pipeline — confirm this is intentional.
# Re-draw the boxplots on the outlier-filtered frame: whiskers should now be clean.
plt.figure(figsize=(20, 15))
outlier_cols = [
    ('pr.axis_aspect_ratio', 'blue'),
    ('skewness_about', 'pink'),
    ('scaled_variance', 'red'),
    ('radius_ratio', 'lightblue'),
    ('scaled_radius_of_gyration.1', 'orange'),
    ('scaled_variance.1', 'green'),
    ('max.length_aspect_ratio', 'purple'),
    ('skewness_about.1', 'gray'),
]
for pos, (col, shade) in enumerate(outlier_cols, start=1):
    plt.subplot(8, 8, pos)
    sns.boxplot(x=cleandf[col], color=shade)
plt.show()

# Note: all attributes that had outliers are now treated. Since the number of
# outliers was small we chose to drop those rows; in general this is avoided
# on large data sets with many outliers because it can lose information.
## Understanding the relationship between all independent attributes

Data correlation is a way to understand the relationship between multiple variables and attributes in a dataset. Using correlation, you can gain insights such as: one or more attributes depend on (or may cause) another attribute, and one or more attributes are associated with other attributes. Spearman and Pearson are two statistical methods for measuring the strength of correlation between two variables; the Pearson correlation coefficient can be used with continuous variables that have a linear relationship.
One or multiple attributes depend on another attribute or a cause for another attribute.
One or multiple attributes are associated with other attributes.
Spearman and Pearson are two statistical methods to calculate the strength of correlation between two variables or attributes. Pearson Correlation Coefficient can be used with continuous variables that have a linear relationship.
def correlation_heatmap(dataframe, l, w):
    """Plot an annotated correlation heatmap of *dataframe* at figsize (l, w)."""
    correlation = dataframe.corr()
    plt.figure(figsize=(l, w))
    sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='viridis')
    plt.title('Correlation between different features')  # fixed typo: 'fearures'
    plt.show()


# Drop the target column before the correlation matrix / pairplot: PCA should
# only be performed on the independent attributes.
# NOTE(review): this reassigns cleandf from the *unfiltered* newdf, silently
# discarding the IQR outlier removal done earlier — confirm intentional.
cleandf = newdf.drop('class', axis=1)
correlation_heatmap(cleandf, 30, 15)

# Observation — strong/fair correlation:
#   scaled_variance & scaled_variance.1: 0.98; skewness_about.2 & hollows_ratio: 0.89;
#   distance_circularity & radius_ratio: 0.81; compactness & circularity,
#   radius_ratio & pr.axis_aspect_ratio: ~0.67; scaled_variance &
#   scaled_radius_of_gyration, circularity & distance_circularity: 0.79;
#   pr.axis_rectangularity & max.length_rectangularity: 0.81;
#   scatter_ratio & elongatedness: strong negative (-0.97);
#   elongatedness & pr.axis_rectangularity: strong negative (-0.95).
# Little/poor correlation:
#   max.length_aspect_ratio & radius_ratio (~0.5); pr.axis_aspect_ratio &
#   max.length_aspect_ratio; scaled_radius_of_gyration & .1;
#   scaled_radius_of_gyration.1 & skewness_about; skewness_about & .1;
#   skewness_about.1 & .2.
- skewness_about_2 and hollow_ratio seems to be strongly correlated, corr coeff: 0.89
- distance_circularity and radius_ratio seem to have a high positive correlation, corr coeff: 0.81
- compactness & circularity, and radius_ratio & pr.axis_aspect_ratio, also seem moderately correlated with coeff: 0.67.
- scaled _variance and scaled_radius_of_gyration, circularity & distance_circularity also seems to be highly correlated with corr coeff: 0.79
- pr.axis_recatngularity and max.length_recatngularity also seems to be strongly correlated with coeff: 0.81
- scatter_ratio and elongatedness seems to be have strong negative correlation val : 0.97
- elongatedness and pr.axis_rectangularity seems to have strong negative correlation, val: 0.95 -max_length_aspect_ratio & radius_ratio have average correlation with coeff: 0.5
- pr.axis_aspect_ratio & max_length_aspect_ratio seems to have very little correlation
- scaled_radius_gyration & scaled_radisu_gyration.1 seems to be very little correlated
- scaled_radius_gyration.1 & skewness_about seems to be very little correlated
- skewness_about & skewness_about.1 not be correlated
# Pairwise scatter matrix of the independent attributes (KDE on the diagonal)
# to visually validate the correlation heatmap.
sns.pairplot(cleandf, diag_kind="kde")

# Quick insights: the pairplot confirms the heatmap. scaled_variance &
# scaled_variance.1 show a very strong positive correlation (0.98), and
# skewness_about.2 & hollows_ratio a strong positive one (0.89);
# scatter_ratio & elongatedness and elongatedness & pr.axis_rectangularity
# show strong negative correlations.
scatter_ratio and elongatedness seems to be have very strong negative correlation. elongatedness and pr.axis_rectangularity seems to have strong negative correlation
## Choosing the right attributes for model building

From the correlation matrix above we can see that many features are highly correlated. On careful analysis, several features have a correlation above 0.9, so we can decide to drop columns whose correlation is ±0.9 or higher. There are 8 such columns: max.length_rectangularity, scaled_radius_of_gyration, skewness_about.2, scatter_ratio, elongatedness, pr.axis_rectangularity, scaled_variance, scaled_variance.1.
# Class balance: how many cars, buses and vans are in the data.
newdf['class'].value_counts()
splitscaledf = newdf.copy()

# Pass the series explicitly as x=: positional use of countplot's plotting
# arguments became keyword-only in newer seaborn releases.
sns.countplot(x=newdf['class'])
plt.show()

# PCA is a dimension-reduction method that turns a large set of (often
# correlated) variables into a smaller set of uncorrelated principal
# components that retain most of the relevant information.
# --- Separate the data into independent attributes (X) and target (y) ---
X = newdf.iloc[:, 0:18].values   # the 18 numeric features
y = newdf.iloc[:, 18].values     # the encoded class label
X

# Standardise X (zero mean, unit variance) before PCA: principal components
# are variance-driven, so unscaled features would dominate the directions.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_std = sc.fit_transform(X)
# The covariance matrix of the 18 standardised features will be 18x18.
# --- PCA by hand: covariance matrix of the standardised features ---
cov_matrix = np.cov(X_std.T)
print("cov_matrix shape:", cov_matrix.shape)
print("Covariance_matrix", cov_matrix)

# Eigen decomposition via numpy linear algebra. (np.linalg.eigh would be the
# natural choice for a symmetric covariance matrix, but eig matches the
# original computation.)
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
print('Eigen Vectors \n%s', eigenvectors)
print('\n Eigen Values \n%s', eigenvalues)

# Pair each eigenvalue with its eigenvector (columns of `eigenvectors`).
eig_pairs = [(eigenvalues[index], eigenvectors[:, index]) for index in range(len(eigenvalues))]

# Sort pairs by eigenvalue, highest first. BUG FIX: the original plain
# sort()/reverse() compared the eigenvector arrays whenever two eigenvalues
# tied, which raises "The truth value of an array ... is ambiguous"; sorting
# on the eigenvalue key alone is safe and gives the same order otherwise.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)

# Extract the descending-ordered eigenvalues and eigenvectors.
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]
print('Eigenvalues in descending order: \n%s' % eigvalues_sorted)

# Variance explained by each component (18 entries, one per eigenvector) and
# the cumulative total (the 18th entry reaches ~100%).
tot = sum(eigenvalues)
var_explained = [(i / tot) for i in sorted(eigenvalues, reverse=True)]
cum_var_exp = np.cumsum(var_explained)

# Scree plot: individual and cumulative explained variance per component.
plt.bar(range(1, 19), var_explained, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1, 19), cum_var_exp, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()

# Observation: the first 8 components explain ~95% of the variance, so the
# first 8 principal components are used for the reduced dimensions from here on.
# P_reduce is the reduced mathematical space: the top-8 eigenvectors.
# (Comment fix: this reduces 18 dimensions to 8, not "8 to 4" as the
# original comment claimed.)
P_reduce = np.array(eigvectors_sorted[0:8])
X_std_8D = np.dot(X_std, P_reduce.T)          # project data onto the 8 principal components
reduced_pca = pd.DataFrame(X_std_8D)          # DataFrame form for the pairplot
reduced_pca

# Pairplot of the reduced dimensions: after PCA the components should show
# no linear relationship with each other.
sns.pairplot(reduced_pca, diag_kind='kde')

# --- Fit SVC on the original scaled data and on the PCA data, 70:30 split ---
Orig_X_train, Orig_X_test, Orig_y_train, Orig_y_test = train_test_split(X_std, y, test_size=0.30, random_state=1)
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(reduced_pca, y, test_size=0.30, random_state=1)

svc = SVC()                                   # model on the original raw (scaled) data
svc.fit(Orig_X_train, Orig_y_train)
Orig_y_predict = svc.predict(Orig_X_test)

svc1 = SVC()                                  # model on the PCA-reduced data
svc1.fit(pca_X_train, pca_y_train)
pca_y_predict = svc1.predict(pca_X_test)

# Accuracy scores of both models.
print("Model Score On Original Data ", svc.score(Orig_X_test, Orig_y_test))
print("Model Score On Reduced PCA Dimension ", svc1.score(pca_X_test, pca_y_test))
print("Before PCA On Original 18 Dimension", accuracy_score(Orig_y_test, Orig_y_predict))
print("After PCA(On 8 dimension)", accuracy_score(pca_y_test, pca_y_predict))

# Observation: the SVC without PCA scores ~95% accuracy; on the 8 PCA
# components it scores ~93% — a small trade-off considering the
# dimensionality dropped from 18 to 8.
# (Continued from above: the 18-dimension model and the 8-dimension PCA model
# both fared well on accuracy.)

def draw_confmatrix(y_test, yhat, str1, str2, str3, datatype):
    """Print and heatmap the confusion matrix of predictions *yhat* vs *y_test*.

    str1..str3 are the axis tick labels for classes 0, 1, 2; *datatype*
    names the data set in the printed header.
    """
    # BUG FIX: `labels` must be passed by keyword — the positional form
    # confusion_matrix(y, yhat, [0, 1, 2]) was removed in scikit-learn >= 1.0.
    cm = confusion_matrix(y_test, yhat, labels=[0, 1, 2])
    print("Confusion Matrix For :", "\n", datatype, cm)
    sns.heatmap(cm, annot=True, fmt='.2f',
                xticklabels=[str1, str2, str3],
                yticklabels=[str1, str2, str3])
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


draw_confmatrix(Orig_y_test, Orig_y_predict, "Van ", "Car ", "Bus", "Original Data Set")
draw_confmatrix(pca_y_test, pca_y_predict, "Van ", "Car ", "Bus", "For Reduced Dimensions Using PCA ")

# Classification reports for both models.
print("Classification Report For Raw Data:", "\n", classification_report(Orig_y_test, Orig_y_predict))
print("Classification Report For PCA:", "\n", classification_report(pca_y_test, pca_y_predict))

# Confusion-matrix analysis, original data set:
# - 58/59 actual vans classified correctly (1 van predicted as bus);
# - 129/133 actual cars classified correctly (3 as bus, 1 as van);
# - 55/62 actual buses classified correctly (6 as van, 1 as car).
### Confusion Matrix Analysis On Reduced Dimensions After PCA

- Out of 59 actual vans, the model correctly predicted 57 and erred on 2, classifying them as cars.
- Out of 133 actual cars, it correctly classified 126 and erred on 7 (5 predicted as bus, 2 as van).
- Out of 62 actual buses, it correctly classified 54 and erred on 8 (7 predicted as car, 1 as van).
Out of 133 actuals cars , our mdoel has correclty classified 126 of them to be a car and faltered in 7 cases where it wrongly classified 5 cars to a bus and 2 cars to be a van.
Out of 62 actual bus , our model has correclty classified 54 of them to be a bus. It has faltered in 8 cases where it wrongly classified 7 bus to be a car and 1 bus to be a van.
# --- Grid search + cross-validation to tune the SVC and validate its score ---
import itertools
from sklearn.model_selection import GridSearchCV


def classifiers_hypertune(name, rf, param_grid, x_train_scaled, y_train,
                          x_test_scaled, y_test, CV):
    """Grid-search estimator *rf* over *param_grid* with *CV* folds and report.

    Prints the best CV score and parameters, the classification report on the
    test set, the confusion-matrix heatmap, and the final accuracy.
    """
    CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=CV,
                         verbose=1, n_jobs=-1)
    CV_rf.fit(x_train_scaled, y_train)
    y_pred_train = CV_rf.predict(x_train_scaled)
    y_pred_test = CV_rf.predict(x_test_scaled)
    print('Best Score: ', CV_rf.best_score_)
    print('Best Params: ', CV_rf.best_params_)
    # Classification report on held-out data.
    print(name + " Classification Report: ")
    print(classification_report(y_test, y_pred_test))
    # Confusion matrix for the test data.
    draw_confmatrix(y_test, y_pred_test, "Van", "Car", "Bus", "Original Data Set")
    print("SVM Accuracy Score:", round(accuracy_score(y_test, y_pred_test), 2) * 100)


# Tune an SVC; first inspect its tweakable parameters.
svmc = SVC()
print("SVM Parameters:", svmc.get_params())

# Parameter grids (the second, wider grid is defined but not used below).
param_grid = [
    {'C': [0.01, 0.05, 0.5, 1], 'kernel': ['linear']},
    {'C': [0.01, 0.05, 0.5, 1], 'kernel': ['rbf']},
]
param_grid_1 = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

# BUG FIX: the original call referenced X_train_std_pca / SplitScale_y_train /
# X_test_std_pca / SplitScale_y_test, none of which are defined anywhere in
# this notebook (NameError at runtime). The PCA train/test split created
# earlier is the intended data and is used here.
classifiers_hypertune("Support Vector Classifier", svmc, param_grid,
                      pca_X_train, pca_y_train, pca_X_test, pca_y_test, 10)